Attention Mechanism Comparison¶

In [2]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio
import pandas as pd
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

from transformers import (
    AutoTokenizer, AutoModel, 
    pipeline, AutoConfig
)

from datasets import load_dataset

from bertviz import model_view, head_view, neuron_view
from bertviz.transformers_neuron_view import BertModel

plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
torch.manual_seed(42)
np.random.seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
Using device: cpu
In [3]:
import html
import re

def load_sentences(n_samples=100, min_words=10, max_words=20):
    """Load sentences from AG News dataset"""
    
    dataset = load_dataset("ag_news", split="train")
    label_names = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}
    sentences = []
    domains = []
    samples_per_category = n_samples // 4
    
    for label in range(4):
        filtered = dataset.filter(lambda x: x['label'] == label)
        shuffled = filtered.shuffle(seed=42)
        count = 0
        for item in shuffled:
            if count >= samples_per_category:
                break
            # decode HTML and clean text
            text = html.unescape(item['text'])
            text = re.sub(r'<[^>]+>', '', text)  # Remove HTML tags
            text = re.sub(r'\s+', ' ', text).strip()  # Normalize whitespace

            # Check length
            word_count = len(text.split())
            if min_words <= word_count <= max_words and len(text) > 20:
                sentences.append(text)
                domains.append(label_names[label])
                count += 1
        
    print(f"\n Loaded {len(sentences)} sentences from AG News:")
    for label_name in label_names.values():
        count = domains.count(label_name)
        print(f"   {label_name}: {count} sentences")
    
    print(f"\n Sample sentences:")
    for i in range(min(3, len(sentences))):
        print(f"   [{domains[i]}] {sentences[i][:80]}...")
    return sentences, domains

TEST_SENTENCES, SENTENCE_DOMAINS = load_sentences(n_samples=100, min_words=10, max_words=20)
 Loaded 100 sentences from AG News:
   World: 25 sentences
   Sports: 25 sentences
   Business: 25 sentences
   Sci/Tech: 25 sentences

 Sample sentences:
   [World] Somalis vie to be new president Twenty-eight candidates are approved to contest ...
   [World] Agency pleads for hostage release Care International appeals on Arabic televisio...
   [World] Clinton recovering after heart op Former US President Bill Clinton's heart bypas...
In [4]:
MODELS = {
    "BERT": {
        "name": "bert-base-uncased",
        "description": "Original BERT",
        "color": "#FF6B6B"
    },
    "DistilBERT": {
        "name": "distilbert-base-uncased", 
        "description": "Compressed BERT",
        "color": "#4ECDC4"
    },
    "RoBERTa": {
        "name": "roberta-base",
        "description": "Optimized BERT",
        "color": "#45B7D1"
    }
}
In [5]:
def load_model_and_tokenizer(model_name):
    """
    Load model and tokenizer for a given model name.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(
        model_name,
        output_attentions=True,
        return_dict=True).to(device)
    config = AutoConfig.from_pretrained(model_name)
    print(f"Modèle {model_name} loaded")
    return model, tokenizer,config

models_data = {}

for model_name, model_info in MODELS.items():
    model, tokenizer, config = load_model_and_tokenizer(model_info["name"])
    models_data[model_name] = {
        "model": model,
        "tokenizer": tokenizer,
        "config": config,
        "description": model_info["description"],
        "color": model_info["color"]
    }
print("✅ All models loaded")
Modèle bert-base-uncased loaded
Modèle distilbert-base-uncased loaded
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Modèle roberta-base loaded
✅ All models loaded
In [6]:
def analyze_model_architecture(models_data):
    """Compare architecture of different models."""
    
    arch_data = []
    
    for model_name, data in models_data.items():
        config = data["config"]
        model = data["model"]
        total_params = sum(p.numel() for p in model.parameters())
        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        arch_info = {
            "Model": model_name,
            "Layers": config.num_hidden_layers,
            "Hidden Size": config.hidden_size,
            "Attention Heads": config.num_attention_heads,
            "Vocab Size": config.vocab_size,
            "Total Params": f"{total_params:,}",
            "Trainable Params": f"{trainable_params:,}",
        }
        arch_data.append(arch_info)
        
        print(f"\n{model_name}:")
        print(f"   Layers: {config.num_hidden_layers}")
        print(f"   Hidden Size: {config.hidden_size}")
        print(f"   Attention Heads: {config.num_attention_heads}")
        print(f"   Vocab Size: {config.vocab_size:,}")
        print(f"   Parameters: {total_params:,}")

    return pd.DataFrame(arch_data)

architecture_df = analyze_model_architecture(models_data)
print("\n Comparison table:")
display(architecture_df)
BERT:
   Layers: 12
   Hidden Size: 768
   Attention Heads: 12
   Vocab Size: 30,522
   Parameters: 109,482,240

DistilBERT:
   Layers: 6
   Hidden Size: 768
   Attention Heads: 12
   Vocab Size: 30,522
   Parameters: 66,362,880

RoBERTa:
   Layers: 12
   Hidden Size: 768
   Attention Heads: 12
   Vocab Size: 50,265
   Parameters: 124,645,632

 Comparison table:
Model Layers Hidden Size Attention Heads Vocab Size Total Params Trainable Params
0 BERT 12 768 12 30522 109,482,240 109,482,240
1 DistilBERT 6 768 12 30522 66,362,880 66,362,880
2 RoBERTa 12 768 12 50265 124,645,632 124,645,632
In [7]:
def analyze_sentence_attention(sentence, model_name, models_data):
    """Analyze attention patterns for a given sentence and model."""
    
    print(f"Analyze attention for: '{sentence}'")
    print(f"Model: {model_name}")
    
    tokenizer = models_data[model_name]["tokenizer"]
    model = models_data[model_name]["model"]
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True)
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    print(f"Tokens ({len(tokens)}): {tokens[:10]}{'...' if len(tokens) > 10 else ''}")

    # Prediction with attention
    with torch.no_grad():
        outputs = model(**inputs)
        attentions = outputs.attentions
    
    num_layers = len(attentions)
    num_heads = attentions[0].shape[1]
    seq_length = attentions[0].shape[-1]
    print(f"Attention shape: {num_layers} layers, {num_heads} heads, {seq_length} tokens")
    
    return {
        "tokens": tokens,
        "attentions": attentions,
        "inputs": inputs,
        "num_layers": num_layers,
        "num_heads": num_heads
    }

test_sentence = TEST_SENTENCES[1]
print(f" Test on: '{test_sentence}'\n")

attention_results = {}
for model_name in models_data.keys():
    attention_results[model_name] = analyze_sentence_attention(
        test_sentence, model_name, models_data
    )
    print("-" * 50)
 Test on: 'Agency pleads for hostage release Care International appeals on Arabic television for the release of its Iraq director, Margaret Hassan.'

Analyze attention for: 'Agency pleads for hostage release Care International appeals on Arabic television for the release of its Iraq director, Margaret Hassan.'
Model: BERT
Tokens (25): ['[CLS]', 'agency', 'plead', '##s', 'for', 'hostage', 'release', 'care', 'international', 'appeals']...
Attention shape: 12 layers, 12 heads, 25 tokens
--------------------------------------------------
Analyze attention for: 'Agency pleads for hostage release Care International appeals on Arabic television for the release of its Iraq director, Margaret Hassan.'
Model: DistilBERT
Tokens (25): ['[CLS]', 'agency', 'plead', '##s', 'for', 'hostage', 'release', 'care', 'international', 'appeals']...
Attention shape: 6 layers, 12 heads, 25 tokens
--------------------------------------------------
Analyze attention for: 'Agency pleads for hostage release Care International appeals on Arabic television for the release of its Iraq director, Margaret Hassan.'
Model: RoBERTa
Tokens (26): ['<s>', 'A', 'gency', 'Ġple', 'ads', 'Ġfor', 'Ġhostage', 'Ġrelease', 'ĠCare', 'ĠInternational']...
Attention shape: 12 layers, 12 heads, 26 tokens
--------------------------------------------------
In [8]:
from IPython.display import display, HTML

def create_attention_visualizations(sentence, model_name, models_data):
    """Create attention visualizations with BertViz"""
    
    tokenizer = models_data[model_name]["tokenizer"]
    model = models_data[model_name]["model"]
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=512)
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs)
        attentions = outputs.attentions
    
    print(f"Tokens: {len(tokens)} tokens")
    print(f"Attention: {len(attentions)} layers, {attentions[0].shape[1]} heads")

    print("Model View (all attention heads):")
    model_view(attentions, tokens)

    print("Head View (per head detail):")
    head_view(attentions, tokens)
    
    return {
        "tokens": tokens,
        "attentions": attentions,
        "num_layers": len(attentions),
        "num_heads": attentions[0].shape[1]
    }

visualization_results = {}

for model_name in models_data.keys():
    print(f"\n{'='*60}")
    print(f"VISUALIZING {model_name}")
    print(f"{'='*60}")

    result = create_attention_visualizations(test_sentence, model_name, models_data)
    visualization_results[model_name] = result
============================================================
VISUALIZING BERT
============================================================
Tokens: 25 tokens
Attention: 12 layers, 12 heads
Model View (all attention heads):
Head View (per head detail):
Layer:
============================================================
VISUALIZING DistilBERT
============================================================
Tokens: 25 tokens
Attention: 6 layers, 12 heads
Model View (all attention heads):
Head View (per head detail):
Layer:
============================================================
VISUALIZING RoBERTa
============================================================
Tokens: 26 tokens
Attention: 12 layers, 12 heads
Model View (all attention heads):
Head View (per head detail):
Layer:

Understanding Attention Patterns¶

What are Layers and Heads?¶

Layers are like steps in processing - each layer refines the understanding:

  • Early layers (0-3): Focus on grammar and word relationships
  • Middle layers (4-8): Build meaning and context
  • Final layers (9-11): Aggregate information for the final output

Attention Heads decide which words are important. Think of it like each word "looking at" other words to understand context. With 12 heads per layer, the model examines different aspects simultaneously.

Reading the Visualizations¶

Model View: Shows which words pay attention to which. Brighter colors = stronger attention.

Head View: A grid showing all attention heads across all layers. Each small matrix shows the attention pattern for one head in one layer.

Comparing the Models¶

BERT - Baseline with hierarchical patterns from syntax to semantics

DistilBERT - Compressed to 6 layers but maintains effectiveness with more focused patterns

RoBERTa - Optimized training leads to cleaner, more targeted attention patterns

In [9]:
def analyze_attention_patterns(sentence, models_data):
    """Analyze attention patterns for different linguistic phenomena"""
    
    print(f"ANALYSE PATTERNS: '{sentence}'")
    print("=" * 60)
    
    patterns_analysis = {}
    
    for model_name, data in models_data.items():
        print(f"\n{model_name}:")
        
        tokenizer = data["tokenizer"]
        model = data["model"]
        inputs = tokenizer(sentence, return_tensors="pt", return_offsets_mapping=False)
        tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        
        with torch.no_grad():
            outputs = model(**inputs)
            attentions = outputs.attentions
        
        last_attention = attentions[-1][0]  # [heads, seq_len, seq_len]
        
        # 1. Attention to [CLS] token
        cls_attention = last_attention[:, 0, :].mean(dim=0)  # Mean over heads
        
        # 2. Self-attention (attention of a token to itself)
        self_attention = torch.diagonal(last_attention.mean(dim=0))

        # 3. Attention to content words vs function words
        content_words = []
        function_words = []
        for i, token in enumerate(tokens):
            if token.startswith('##') or token in ['[CLS]', '[SEP]', '[PAD]']:
                continue
            elif token.lower() in ['the', 'a', 'an', 'is', 'are', 'was', 'were', 'of', 'in', 'on', 'at']:
                function_words.append(i)
            else:
                content_words.append(i)
        
        avg_attention_to_content = last_attention.mean(dim=0)[:, content_words].mean() if content_words else 0
        avg_attention_to_function = last_attention.mean(dim=0)[:, function_words].mean() if function_words else 0
        
        patterns = {
            "cls_attention_max": float(cls_attention.max()),
            "cls_attention_mean": float(cls_attention.mean()),
            "self_attention_mean": float(self_attention.mean()),
            "content_vs_function": float(avg_attention_to_content / max(avg_attention_to_function, 0.001)),
            "attention_entropy": float(-torch.sum(last_attention.mean(dim=0) * torch.log(last_attention.mean(dim=0) + 1e-10)).mean())
        }
        
        patterns_analysis[model_name] = patterns

        print(f"   CLS attention max: {patterns['cls_attention_max']:.3f}")
        print(f"   Self-attention mean: {patterns['self_attention_mean']:.3f}")
        print(f"   Content/Function ratio: {patterns['content_vs_function']:.2f}")
        print(f"   Attention entropy: {patterns['attention_entropy']:.3f}")

    return patterns_analysis


all_patterns = {}
for i, sentence in enumerate(TEST_SENTENCES):
    print(f"\n{'='*20} SENTENCE {i+1} {'='*20}")
    patterns = analyze_attention_patterns(sentence, models_data)
    all_patterns[f"Sentence_{i+1}"] = patterns
==================== SENTENCE 1 ====================
ANALYSE PATTERNS: 'Somalis vie to be new president Twenty-eight candidates are approved to contest next week's elections to be Somalia's new leader.'
============================================================

BERT:
   CLS attention max: 0.108
   Self-attention mean: 0.088
   Content/Function ratio: 1.35
   Attention entropy: 61.730

DistilBERT:
   CLS attention max: 0.171
   Self-attention mean: 0.070
   Content/Function ratio: 0.89
   Attention entropy: 62.514

RoBERTa:
   CLS attention max: 0.281
   Self-attention mean: 0.128
   Content/Function ratio: 3.81
   Attention entropy: 60.199

==================== SENTENCE 2 ====================
ANALYSE PATTERNS: 'Agency pleads for hostage release Care International appeals on Arabic television for the release of its Iraq director, Margaret Hassan.'
============================================================

BERT:
   CLS attention max: 0.102
   Self-attention mean: 0.112
   Content/Function ratio: 1.59
   Attention entropy: 53.609

DistilBERT:
   CLS attention max: 0.196
   Self-attention mean: 0.083
   Content/Function ratio: 2.18
   Attention entropy: 50.663

RoBERTa:
   CLS attention max: 0.270
   Self-attention mean: 0.128
   Content/Function ratio: 1.50
   Attention entropy: 53.205

==================== SENTENCE 3 ====================
ANALYSE PATTERNS: 'Clinton recovering after heart op Former US President Bill Clinton's heart bypass operation is successful, say doctors in New York.'
============================================================

BERT:
   CLS attention max: 0.104
   Self-attention mean: 0.116
   Content/Function ratio: 0.76
   Attention entropy: 49.653

DistilBERT:
   CLS attention max: 0.183
   Self-attention mean: 0.085
   Content/Function ratio: 1.00
   Attention entropy: 54.519

RoBERTa:
   CLS attention max: 0.259
   Self-attention mean: 0.121
   Content/Function ratio: 40.00
   Attention entropy: 51.660

==================== SENTENCE 4 ====================
ANALYSE PATTERNS: 'Tired of post-9/11 hassles, Arab tourists head east Saudi visitors to Malaysia were up 53 percent in 2004.'
============================================================

BERT:
   CLS attention max: 0.107
   Self-attention mean: 0.102
   Content/Function ratio: 2.16
   Attention entropy: 63.509

DistilBERT:
   CLS attention max: 0.124
   Self-attention mean: 0.078
   Content/Function ratio: 2.13
   Attention entropy: 62.896

RoBERTa:
   CLS attention max: 0.260
   Self-attention mean: 0.142
   Content/Function ratio: 34.48
   Attention entropy: 58.007

==================== SENTENCE 5 ====================
ANALYSE PATTERNS: 'Greenspan warns over US deficit Federal Reserve chief Alan Greenspan says the US trade deficit cannot be sustained indefinitely.'
============================================================

BERT:
   CLS attention max: 0.199
   Self-attention mean: 0.116
   Content/Function ratio: 2.66
   Attention entropy: 46.574

DistilBERT:
   CLS attention max: 0.241
   Self-attention mean: 0.089
   Content/Function ratio: 1.86
   Attention entropy: 45.872

RoBERTa:
   CLS attention max: 0.281
   Self-attention mean: 0.141
   Content/Function ratio: 40.00
   Attention entropy: 51.452

==================== SENTENCE 6 ====================
ANALYSE PATTERNS: 'Landmine kills Afghan policemen Seven Afghan policemen are killed by a landmine in the Kandahar province, ahead of landmark elections.'
============================================================

BERT:
   CLS attention max: 0.106
   Self-attention mean: 0.097
   Content/Function ratio: 2.36
   Attention entropy: 62.710

DistilBERT:
   CLS attention max: 0.142
   Self-attention mean: 0.085
   Content/Function ratio: 2.64
   Attention entropy: 63.065

RoBERTa:
   CLS attention max: 0.272
   Self-attention mean: 0.115
   Content/Function ratio: 37.04
   Attention entropy: 55.344

==================== SENTENCE 7 ====================
ANALYSE PATTERNS: 'Ghana votes in presidential poll Presidential elections are taking place in Ghana, which boasts political stability but much poverty.'
============================================================

BERT:
   CLS attention max: 0.176
   Self-attention mean: 0.112
   Content/Function ratio: 2.83
   Attention entropy: 48.111

DistilBERT:
   CLS attention max: 0.135
   Self-attention mean: 0.083
   Content/Function ratio: 2.47
   Attention entropy: 48.360

RoBERTa:
   CLS attention max: 0.267
   Self-attention mean: 0.142
   Content/Function ratio: 41.67
   Attention entropy: 50.312

==================== SENTENCE 8 ====================
ANALYSE PATTERNS: 'New Zimbabwe restrictions target aid groups A proposed law would limit foreign funding of churches and AIDS programs.'
============================================================

BERT:
   CLS attention max: 0.158
   Self-attention mean: 0.129
   Content/Function ratio: 1.61
   Attention entropy: 41.358

DistilBERT:
   CLS attention max: 0.175
   Self-attention mean: 0.088
   Content/Function ratio: 1.97
   Attention entropy: 40.642

RoBERTa:
   CLS attention max: 0.277
   Self-attention mean: 0.118
   Content/Function ratio: 47.62
   Attention entropy: 42.179

==================== SENTENCE 9 ====================
ANALYSE PATTERNS: 'German trial stirs torture debate An ex-police officer goes on trial in Germany charged with threatening a suspect with torture.'
============================================================

BERT:
   CLS attention max: 0.119
   Self-attention mean: 0.107
   Content/Function ratio: 2.05
   Attention entropy: 54.029

DistilBERT:
   CLS attention max: 0.193
   Self-attention mean: 0.086
   Content/Function ratio: 2.28
   Attention entropy: 54.654

RoBERTa:
   CLS attention max: 0.277
   Self-attention mean: 0.132
   Content/Function ratio: 38.46
   Attention entropy: 51.046

==================== SENTENCE 10 ====================
ANALYSE PATTERNS: 'Painkiller risk to gut revealed The risk of intestinal damage from common painkillers may be higher than thought, research suggests.'
============================================================

BERT:
   CLS attention max: 0.144
   Self-attention mean: 0.109
   Content/Function ratio: 2.25
   Attention entropy: 63.005

DistilBERT:
   CLS attention max: 0.176
   Self-attention mean: 0.082
   Content/Function ratio: 2.47
   Attention entropy: 62.340

RoBERTa:
   CLS attention max: 0.263
   Self-attention mean: 0.123
   Content/Function ratio: 38.46
   Attention entropy: 50.159

==================== SENTENCE 11 ====================
ANALYSE PATTERNS: 'Fighting rages in South Ossetia Heavy fighting erupts in Georgia's breakaway South Ossetia region, shattering a two-day ceasefire.'
============================================================

BERT:
   CLS attention max: 0.145
   Self-attention mean: 0.079
   Content/Function ratio: 4.34
   Attention entropy: 65.446

DistilBERT:
   CLS attention max: 0.150
   Self-attention mean: 0.064
   Content/Function ratio: 2.69
   Attention entropy: 74.924

RoBERTa:
   CLS attention max: 0.267
   Self-attention mean: 0.110
   Content/Function ratio: 31.25
   Attention entropy: 67.078

==================== SENTENCE 12 ====================
ANALYSE PATTERNS: ''Few ready' for information act Public bodies are ill-prepared for the Freedom of Information Act, says a group of MPs.'
============================================================

BERT:
   CLS attention max: 0.089
   Self-attention mean: 0.102
   Content/Function ratio: 2.80
   Attention entropy: 56.225

DistilBERT:
   CLS attention max: 0.184
   Self-attention mean: 0.089
   Content/Function ratio: 1.95
   Attention entropy: 58.090

RoBERTa:
   CLS attention max: 0.250
   Self-attention mean: 0.146
   Content/Function ratio: 34.48
   Attention entropy: 60.050

==================== SENTENCE 13 ====================
ANALYSE PATTERNS: 'Charges over Montenegro killing Montenegro prosecutors charge a former karate champion over the death of a newspaper editor.'
============================================================

BERT:
   CLS attention max: 0.115
   Self-attention mean: 0.115
   Content/Function ratio: 1.61
   Attention entropy: 41.917

DistilBERT:
   CLS attention max: 0.211
   Self-attention mean: 0.097
   Content/Function ratio: 1.89
   Attention entropy: 42.223

RoBERTa:
   CLS attention max: 0.299
   Self-attention mean: 0.138
   Content/Function ratio: 41.67
   Attention entropy: 49.930

==================== SENTENCE 14 ====================
ANALYSE PATTERNS: ''Distressed' Thatcher flies home Baroness Thatcher returns home as it emerges her son Sir Mark could face extradition proceedings.'
============================================================

BERT:
   CLS attention max: 0.166
   Self-attention mean: 0.101
   Content/Function ratio: 29.91
   Attention entropy: 45.584

DistilBERT:
   CLS attention max: 0.223
   Self-attention mean: 0.084
   Content/Function ratio: 34.17
   Attention entropy: 50.705

RoBERTa:
   CLS attention max: 0.276
   Self-attention mean: 0.150
   Content/Function ratio: 38.46
   Attention entropy: 49.948

==================== SENTENCE 15 ====================
ANALYSE PATTERNS: 'U.S. Marine Killed in Anbar Province U.S. Marine killed in Iraq's Anbar provinceBC-Iraq-Military Death,0115'
============================================================

BERT:
   CLS attention max: 0.093
   Self-attention mean: 0.094
   Content/Function ratio: 2.88
   Attention entropy: 89.299

DistilBERT:
   CLS attention max: 0.115
   Self-attention mean: 0.066
   Content/Function ratio: 3.25
   Attention entropy: 86.616

RoBERTa:
   CLS attention max: 0.289
   Self-attention mean: 0.107
   Content/Function ratio: 30.30
   Attention entropy: 67.425

==================== SENTENCE 16 ====================
ANALYSE PATTERNS: 'Alcohol hampers depth perception Drinking alcohol impairs driving ability by disrupting depth perception, researchers find.'
============================================================

BERT:
   CLS attention max: 0.118
   Self-attention mean: 0.103
   Content/Function ratio: 33.39
   Attention entropy: 41.201

DistilBERT:
   CLS attention max: 0.229
   Self-attention mean: 0.100
   Content/Function ratio: 27.23
   Attention entropy: 47.713

RoBERTa:
   CLS attention max: 0.292
   Self-attention mean: 0.122
   Content/Function ratio: 43.48
   Attention entropy: 44.052

==================== SENTENCE 17 ====================
ANALYSE PATTERNS: 'Table tennis: Gold for China Zhang Yining beats North Korea's Kim Hyang-Mi to win the table tennis women's singles.'
============================================================

BERT:
   CLS attention max: 0.132
   Self-attention mean: 0.094
   Content/Function ratio: 2.06
   Attention entropy: 66.171

DistilBERT:
   CLS attention max: 0.155
   Self-attention mean: 0.070
   Content/Function ratio: 2.29
   Attention entropy: 66.386

RoBERTa:
   CLS attention max: 0.269
   Self-attention mean: 0.112
   Content/Function ratio: 34.48
   Attention entropy: 58.151

==================== SENTENCE 18 ====================
ANALYSE PATTERNS: 'Swiss 'reject' citizenship reform Swiss voters appear to have rejected proposals to relax the country's strict naturalisation laws.'
============================================================

BERT:
   CLS attention max: 0.151
   Self-attention mean: 0.095
   Content/Function ratio: 2.18
   Attention entropy: 47.602

DistilBERT:
   CLS attention max: 0.201
   Self-attention mean: 0.080
   Content/Function ratio: 1.91
   Attention entropy: 51.887

RoBERTa:
   CLS attention max: 0.276
   Self-attention mean: 0.137
   Content/Function ratio: 37.04
   Attention entropy: 53.779

==================== SENTENCE 19 ====================
ANALYSE PATTERNS: 'Maradona 'can be treated abroad' Former football star Diego Maradona can return to Cuba for drug rehabilitation, his lawyer says.'
============================================================

BERT:
   CLS attention max: 0.120
   Self-attention mean: 0.092
   Content/Function ratio: 25.88
   Attention entropy: 61.333

DistilBERT:
   CLS attention max: 0.174
   Self-attention mean: 0.083
   Content/Function ratio: 30.65
   Attention entropy: 64.968

RoBERTa:
   CLS attention max: 0.279
   Self-attention mean: 0.127
   Content/Function ratio: 33.33
   Attention entropy: 62.869

==================== SENTENCE 20 ====================
ANALYSE PATTERNS: 'Cambodia set to crown new king Cambodians prepare for the coronation of King Sihamoni, amid an array of official festivities.'
============================================================

BERT:
   CLS attention max: 0.106
   Self-attention mean: 0.087
   Content/Function ratio: 3.80
   Attention entropy: 58.372

DistilBERT:
   CLS attention max: 0.141
   Self-attention mean: 0.080
   Content/Function ratio: 3.51
   Attention entropy: 60.281

RoBERTa:
   CLS attention max: 0.266
   Self-attention mean: 0.119
   Content/Function ratio: 32.26
   Attention entropy: 61.734

==================== SENTENCE 21 ====================
ANALYSE PATTERNS: 'Observers approve Afghan election International observers say calls to annul the Afghan presidential poll on grounds of fraud are unjustified.'
============================================================

BERT:
   CLS attention max: 0.122
   Self-attention mean: 0.099
   Content/Function ratio: 2.11
   Attention entropy: 52.553

DistilBERT:
   CLS attention max: 0.205
   Self-attention mean: 0.080
   Content/Function ratio: 2.58
   Attention entropy: 51.205

RoBERTa:
   CLS attention max: 0.275
   Self-attention mean: 0.129
   Content/Function ratio: 37.04
   Attention entropy: 53.768

==================== SENTENCE 22 ====================
ANALYSE PATTERNS: 'Burma crackdown on luxury cars Burma investigates illegally imported luxury cars, the latest repercussion of former PM Khin Nyunt's ouster.'
============================================================

BERT:
   CLS attention max: 0.129
   Self-attention mean: 0.096
   Content/Function ratio: 1.99
   Attention entropy: 69.674

DistilBERT:
   CLS attention max: 0.213
   Self-attention mean: 0.078
   Content/Function ratio: 1.96
   Attention entropy: 70.122

RoBERTa:
   CLS attention max: 0.265
   Self-attention mean: 0.131
   Content/Function ratio: 2.37
   Attention entropy: 58.381

==================== SENTENCE 23 ====================
ANALYSE PATTERNS: 'Turkish hostages 'killed' in Iraq An Iraqi militant group kills three Turkish hostages, reports Arabic TV station al-Jazeera.'
============================================================

BERT:
   CLS attention max: 0.098
   Self-attention mean: 0.114
   Content/Function ratio: 1.30
   Attention entropy: 56.145

DistilBERT:
   CLS attention max: 0.175
   Self-attention mean: 0.081
   Content/Function ratio: 1.31
   Attention entropy: 57.362

RoBERTa:
   CLS attention max: 0.282
   Self-attention mean: 0.112
   Content/Function ratio: 37.04
   Attention entropy: 53.194

==================== SENTENCE 24 ====================
ANALYSE PATTERNS: 'Home users get Windows update Microsoft is making its important security update for Windows XP available on auto-update servers today.'
============================================================

BERT:
   CLS attention max: 0.132
   Self-attention mean: 0.124
   Content/Function ratio: 2.20
   Attention entropy: 51.523

DistilBERT:
   CLS attention max: 0.221
   Self-attention mean: 0.085
   Content/Function ratio: 1.87
   Attention entropy: 52.584

RoBERTa:
   CLS attention max: 0.259
   Self-attention mean: 0.133
   Content/Function ratio: 40.00
   Attention entropy: 56.743

==================== SENTENCE 25 ====================
ANALYSE PATTERNS: 'Chinese firm buys IBM PC business IBM is selling its PC hardware business to number one Chinese computer maker Lenovo.'
============================================================

BERT:
   CLS attention max: 0.146
   Self-attention mean: 0.123
   Content/Function ratio: 1.92
   Attention entropy: 47.939

DistilBERT:
   CLS attention max: 0.215
   Self-attention mean: 0.088
   Content/Function ratio: 1.42
   Attention entropy: 45.450

RoBERTa:
   CLS attention max: 0.299
   Self-attention mean: 0.132
   Content/Function ratio: 43.48
   Attention entropy: 46.955

==================== SENTENCE 26 ====================
ANALYSE PATTERNS: 'Today's schedule Pro basketball: WNBA playoffs: Sun vs. Washington (Game 3) at Mohegan Sun Arena, Uncasville, Conn., 8 p.m.'
============================================================

BERT:
   CLS attention max: 0.085
   Self-attention mean: 0.091
   Content/Function ratio: 2.85
   Attention entropy: 102.113

DistilBERT:
   CLS attention max: 0.105
   Self-attention mean: 0.062
   Content/Function ratio: 3.46
   Attention entropy: 105.811

RoBERTa:
   CLS attention max: 0.267
   Self-attention mean: 0.116
   Content/Function ratio: 26.32
   Attention entropy: 83.911

==================== SENTENCE 27 ====================
ANALYSE PATTERNS: 'NFL Pass-Interference Crackwon Draws Fire (AP) AP - Darren Sharper is upset about the NFL's crackdown on pass interference.'
============================================================

BERT:
   CLS attention max: 0.146
   Self-attention mean: 0.089
   Content/Function ratio: 2.28
   Attention entropy: 69.033

DistilBERT:
   CLS attention max: 0.181
   Self-attention mean: 0.076
   Content/Function ratio: 1.27
   Attention entropy: 68.324

RoBERTa:
   CLS attention max: 0.241
   Self-attention mean: 0.137
   Content/Function ratio: 32.26
   Attention entropy: 61.675

==================== SENTENCE 28 ====================
ANALYSE PATTERNS: 'Transactions BASEBALL Arizona (NL): Signed a two-year player development agreement with Tennessee (Southern).'
============================================================

BERT:
   CLS attention max: 0.108
   Self-attention mean: 0.112
   Content/Function ratio: 3.05
   Attention entropy: 46.106

DistilBERT:
   CLS attention max: 0.170
   Self-attention mean: 0.091
   Content/Function ratio: 1.98
   Attention entropy: 52.840

RoBERTa:
   CLS attention max: 0.343
   Self-attention mean: 0.171
   Content/Function ratio: 43.48
   Attention entropy: 46.249

==================== SENTENCE 29 ====================
ANALYSE PATTERNS: 'Rams Roll Past Redskins 28-3 (AP) AP - Torry Holt and the St. Louis Rams finally had something to celebrate.'
============================================================

BERT:
   CLS attention max: 0.110
   Self-attention mean: 0.089
   Content/Function ratio: 3.63
   Attention entropy: 59.903

DistilBERT:
   CLS attention max: 0.165
   Self-attention mean: 0.081
   Content/Function ratio: 1.52
   Attention entropy: 66.351

RoBERTa:
   CLS attention max: 0.275
   Self-attention mean: 0.138
   Content/Function ratio: 33.33
   Attention entropy: 58.415

==================== SENTENCE 30 ====================
ANALYSE PATTERNS: 'Quincy gets its revenge It took a year, but Quincy's volleyball team has bragging rights in the city again.'
============================================================

BERT:
   CLS attention max: 0.116
   Self-attention mean: 0.099
   Content/Function ratio: 3.37
   Attention entropy: 57.322

DistilBERT:
   CLS attention max: 0.161
   Self-attention mean: 0.087
   Content/Function ratio: 2.74
   Attention entropy: 61.465

RoBERTa:
   CLS attention max: 0.278
   Self-attention mean: 0.144
   Content/Function ratio: 2.06
   Attention entropy: 52.182

==================== SENTENCE 31 ====================
ANALYSE PATTERNS: 'Macey surges into fourth Britain's Dean Macey lies in fourth place after five events of the decathlon.'
============================================================

BERT:
   CLS attention max: 0.118
   Self-attention mean: 0.114
   Content/Function ratio: 3.06
   Attention entropy: 51.432

DistilBERT:
   CLS attention max: 0.228
   Self-attention mean: 0.090
   Content/Function ratio: 2.68
   Attention entropy: 53.708

RoBERTa:
   CLS attention max: 0.275
   Self-attention mean: 0.148
   Content/Function ratio: 41.67
   Attention entropy: 44.847

==================== SENTENCE 32 ====================
ANALYSE PATTERNS: 'Ruffin Grabs Attention Forward Michael Ruffin is impressing Wizards coaches with his rebounding and rough play under the basket.'
============================================================

BERT:
   CLS attention max: 0.137
   Self-attention mean: 0.109
   Content/Function ratio: 1.98
   Attention entropy: 48.097

DistilBERT:
   CLS attention max: 0.224
   Self-attention mean: 0.092
   Content/Function ratio: 1.33
   Attention entropy: 56.100

RoBERTa:
   CLS attention max: 0.259
   Self-attention mean: 0.137
   Content/Function ratio: 2.26
   Attention entropy: 55.430

==================== SENTENCE 33 ====================
ANALYSE PATTERNS: 'Rebels Target Erickson Ole Miss has received permission from the San Francisco 49ers to speak with head coach Dennis Erickson.'
============================================================

BERT:
   CLS attention max: 0.121
   Self-attention mean: 0.118
   Content/Function ratio: 2.24
   Attention entropy: 46.629

DistilBERT:
   CLS attention max: 0.178
   Self-attention mean: 0.091
   Content/Function ratio: 2.49
   Attention entropy: 50.740

RoBERTa:
   CLS attention max: 0.272
   Self-attention mean: 0.120
   Content/Function ratio: 37.04
   Attention entropy: 60.282

==================== SENTENCE 34 ====================
ANALYSE PATTERNS: 'US NBA players become the Nightmare Team after epic loss (AFP) AFP - Call them the "Nightmare Team".'
============================================================

BERT:
   CLS attention max: 0.086
   Self-attention mean: 0.117
   Content/Function ratio: 2.18
   Attention entropy: 57.136

DistilBERT:
   CLS attention max: 0.183
   Self-attention mean: 0.094
   Content/Function ratio: 1.51
   Attention entropy: 62.509

RoBERTa:
   CLS attention max: 0.291
   Self-attention mean: 0.140
   Content/Function ratio: 40.00
   Attention entropy: 50.283

==================== SENTENCE 35 ====================
ANALYSE PATTERNS: 'Sportsview: Eagles Have Attitude, Talent (AP) AP - The Philadelphia Eagles had talent. Now they have swagger and personality, too.'
============================================================

BERT:
   CLS attention max: 0.081
   Self-attention mean: 0.097
   Content/Function ratio: 3.19
   Attention entropy: 78.178

DistilBERT:
   CLS attention max: 0.109
   Self-attention mean: 0.085
   Content/Function ratio: 2.05
   Attention entropy: 78.285

RoBERTa:
   CLS attention max: 0.186
   Self-attention mean: 0.111
   Content/Function ratio: 31.25
   Attention entropy: 73.647

==================== SENTENCE 36 ====================
ANALYSE PATTERNS: 'Colts' Freeney Rushes to Get Most Sacks (AP) AP - Dwight Freeney always has gotten the attention of offenses.'
============================================================

BERT:
   CLS attention max: 0.102
   Self-attention mean: 0.115
   Content/Function ratio: 3.15
   Attention entropy: 58.072

DistilBERT:
   CLS attention max: 0.177
   Self-attention mean: 0.086
   Content/Function ratio: 2.12
   Attention entropy: 60.464

RoBERTa:
   CLS attention max: 0.263
   Self-attention mean: 0.127
   Content/Function ratio: 31.25
   Attention entropy: 66.365

==================== SENTENCE 37 ====================
ANALYSE PATTERNS: 'Greek pair await IAAF fate Kostas Kenteris and Katerina Thanou's missed drugs tests will be investigated by the IAAF.'
============================================================

BERT:
   CLS attention max: 0.131
   Self-attention mean: 0.102
   Content/Function ratio: 3.29
   Attention entropy: 56.117

DistilBERT:
   CLS attention max: 0.202
   Self-attention mean: 0.074
   Content/Function ratio: 3.05
   Attention entropy: 54.536

RoBERTa:
   CLS attention max: 0.244
   Self-attention mean: 0.117
   Content/Function ratio: 3.44
   Attention entropy: 63.883

==================== SENTENCE 38 ====================
ANALYSE PATTERNS: 'Barnstable meets challenge The Barnstable girls' volleyball team was looking for challenging nonleague opponents before the state tournament.'
============================================================

BERT:
   CLS attention max: 0.112
   Self-attention mean: 0.108
   Content/Function ratio: 2.46
   Attention entropy: 48.648

DistilBERT:
   CLS attention max: 0.205
   Self-attention mean: 0.093
   Content/Function ratio: 2.18
   Attention entropy: 50.728

RoBERTa:
   CLS attention max: 0.264
   Self-attention mean: 0.136
   Content/Function ratio: 40.00
   Attention entropy: 50.007

==================== SENTENCE 39 ====================
ANALYSE PATTERNS: 'SI.com ST. LOUIS (Ticker) -- The Cincinnati Reds continue to find new ways to lose to the St. Louis Cardinals.'
============================================================

BERT:
   CLS attention max: 0.084
   Self-attention mean: 0.095
   Content/Function ratio: 3.87
   Attention entropy: 70.721

DistilBERT:
   CLS attention max: 0.103
   Self-attention mean: 0.078
   Content/Function ratio: 1.85
   Attention entropy: 83.986

RoBERTa:
   CLS attention max: 0.263
   Self-attention mean: 0.110
   Content/Function ratio: 1.73
   Attention entropy: 65.946

==================== SENTENCE 40 ====================
ANALYSE PATTERNS: 'Tyson Completes Service Charges stemming from a 2003 altercation are dropped as Mike Tyson completes community service on Wednesday.'
============================================================

BERT:
   CLS attention max: 0.114
   Self-attention mean: 0.107
   Content/Function ratio: 2.47
   Attention entropy: 41.588

DistilBERT:
   CLS attention max: 0.187
   Self-attention mean: 0.096
   Content/Function ratio: 1.83
   Attention entropy: 48.059

RoBERTa:
   CLS attention max: 0.279
   Self-attention mean: 0.149
   Content/Function ratio: 40.00
   Attention entropy: 50.392

==================== SENTENCE 41 ====================
ANALYSE PATTERNS: 'Transactions BASEBALL Cleveland (AL): Sold INF Erick Almonte to Nippon (Japan). New York (AL): Signed P Tanyon Sturtze.'
============================================================

BERT:
   CLS attention max: 0.082
   Self-attention mean: 0.096
   Content/Function ratio: 2.84
   Attention entropy: 85.574

DistilBERT:
   CLS attention max: 0.110
   Self-attention mean: 0.077
   Content/Function ratio: 3.54
   Attention entropy: 95.095

RoBERTa:
   CLS attention max: 0.275
   Self-attention mean: 0.114
   Content/Function ratio: 3.96
   Attention entropy: 75.298

==================== SENTENCE 42 ====================
ANALYSE PATTERNS: 'Transactions BASEBALL Seattle (AL): Named Mike Hargrove manager and agreed to terms on a three-year contract.'
============================================================

BERT:
   CLS attention max: 0.098
   Self-attention mean: 0.100
   Content/Function ratio: 3.94
   Attention entropy: 54.168

DistilBERT:
   CLS attention max: 0.146
   Self-attention mean: 0.083
   Content/Function ratio: 2.30
   Attention entropy: 58.925

RoBERTa:
   CLS attention max: 0.286
   Self-attention mean: 0.135
   Content/Function ratio: 35.71
   Attention entropy: 56.673

==================== SENTENCE 43 ====================
ANALYSE PATTERNS: 'British eventers slip back Great Britain slip down to third after the cross-country round of the three-day eventing.'
============================================================

BERT:
   CLS attention max: 0.107
   Self-attention mean: 0.101
   Content/Function ratio: 2.08
   Attention entropy: 56.799

DistilBERT:
   CLS attention max: 0.237
   Self-attention mean: 0.084
   Content/Function ratio: 1.77
   Attention entropy: 57.910

RoBERTa:
   CLS attention max: 0.268
   Self-attention mean: 0.133
   Content/Function ratio: 37.04
   Attention entropy: 54.724

==================== SENTENCE 44 ====================
ANALYSE PATTERNS: 'Final Preseason Game Important for McMahon (AP) AP - Don't tell Mike McMahon the NFL's final exhibitions are meaningless.'
============================================================

BERT:
   CLS attention max: 0.100
   Self-attention mean: 0.099
   Content/Function ratio: 2.34
   Attention entropy: 66.570

DistilBERT:
   CLS attention max: 0.140
   Self-attention mean: 0.079
   Content/Function ratio: 1.57
   Attention entropy: 66.973

RoBERTa:
   CLS attention max: 0.261
   Self-attention mean: 0.132
   Content/Function ratio: 37.04
   Attention entropy: 56.080

==================== SENTENCE 45 ====================
ANALYSE PATTERNS: 'Phelps' Trial Set 19-year-old Olympic swimming champion Michael Phelps' drunken driving trial is set for Dec. 29.'
============================================================

BERT:
   CLS attention max: 0.102
   Self-attention mean: 0.104
   Content/Function ratio: 1.54
   Attention entropy: 52.102

DistilBERT:
   CLS attention max: 0.172
   Self-attention mean: 0.090
   Content/Function ratio: 0.98
   Attention entropy: 62.051

RoBERTa:
   CLS attention max: 0.275
   Self-attention mean: 0.146
   Content/Function ratio: 35.71
   Attention entropy: 56.992

==================== SENTENCE 46 ====================
ANALYSE PATTERNS: 'NL notables The Mets' Jeff Keppinger got his first major league steal in the seventh, swiping second.'
============================================================

BERT:
   CLS attention max: 0.095
   Self-attention mean: 0.091
   Content/Function ratio: 3.65
   Attention entropy: 54.018

DistilBERT:
   CLS attention max: 0.200
   Self-attention mean: 0.083
   Content/Function ratio: 1.66
   Attention entropy: 58.853

RoBERTa:
   CLS attention max: 0.291
   Self-attention mean: 0.112
   Content/Function ratio: 38.46
   Attention entropy: 48.166

==================== SENTENCE 47 ====================
ANALYSE PATTERNS: 'Sooners stop at nothing DALLAS -- The recruiting battle was as intense as everyone expected. Oklahoma against Texas -- again.'
============================================================

BERT:
   CLS attention max: 0.123
   Self-attention mean: 0.104
   Content/Function ratio: 1.78
   Attention entropy: 51.506

DistilBERT:
   CLS attention max: 0.232
   Self-attention mean: 0.090
   Content/Function ratio: 0.92
   Attention entropy: 54.546

RoBERTa:
   CLS attention max: 0.261
   Self-attention mean: 0.125
   Content/Function ratio: 37.04
   Attention entropy: 54.345

==================== SENTENCE 48 ====================
ANALYSE PATTERNS: 'Today's schedule Amateur baseball: Yawkey League playoffs -- South Boston vs. Somerville at Ronan Park, Dorchester, 7:30 p.m.'
============================================================

BERT:
   CLS attention max: 0.108
   Self-attention mean: 0.091
   Content/Function ratio: 3.15
   Attention entropy: 78.316

DistilBERT:
   CLS attention max: 0.173
   Self-attention mean: 0.071
   Content/Function ratio: 1.90
   Attention entropy: 77.828

RoBERTa:
   CLS attention max: 0.272
   Self-attention mean: 0.106
   Content/Function ratio: 2.80
   Attention entropy: 73.250

==================== SENTENCE 49 ====================
ANALYSE PATTERNS: 'Today's schedule College hockey: MEN -- Worcester St. at Wentworth, 8 p.m.; WOMEN -- Rensselaer at MIT, 7 p.m.'
============================================================

BERT:
   CLS attention max: 0.071
   Self-attention mean: 0.088
   Content/Function ratio: 1.50
   Attention entropy: 91.062

DistilBERT:
   CLS attention max: 0.160
   Self-attention mean: 0.072
   Content/Function ratio: 1.51
   Attention entropy: 89.079

RoBERTa:
   CLS attention max: 0.282
   Self-attention mean: 0.100
   Content/Function ratio: 27.03
   Attention entropy: 75.337

==================== SENTENCE 50 ====================
ANALYSE PATTERNS: 'Martinez Deal Finalized Martinez passes his physical, and the Mets finalize their \$53 million, four-year contract with the pitcher.'
============================================================

BERT:
   CLS attention max: 0.114
   Self-attention mean: 0.092
   Content/Function ratio: 4.21
   Attention entropy: 59.085

DistilBERT:
   CLS attention max: 0.180
   Self-attention mean: 0.078
   Content/Function ratio: 2.12
   Attention entropy: 68.812

RoBERTa:
   CLS attention max: 0.259
   Self-attention mean: 0.150
   Content/Function ratio: 32.26
   Attention entropy: 61.319

==================== SENTENCE 51 ====================
ANALYSE PATTERNS: 'Red Bull snaps up Jaguar F1 team Energy drink company Red Bull has bought the Jaguar Formula One team.'
============================================================

BERT:
   CLS attention max: 0.133
   Self-attention mean: 0.122
   Content/Function ratio: 1.44
   Attention entropy: 45.058

DistilBERT:
   CLS attention max: 0.237
   Self-attention mean: 0.091
   Content/Function ratio: 1.54
   Attention entropy: 42.626

RoBERTa:
   CLS attention max: 0.307
   Self-attention mean: 0.143
   Content/Function ratio: 43.48
   Attention entropy: 45.069

==================== SENTENCE 52 ====================
ANALYSE PATTERNS: 'Dell's Secret Earnings Engine The company gets its highest profit margins from a conspicuously old economy business.'
============================================================

BERT:
   CLS attention max: 0.130
   Self-attention mean: 0.107
   Content/Function ratio: 1.74
   Attention entropy: 46.423

DistilBERT:
   CLS attention max: 0.216
   Self-attention mean: 0.081
   Content/Function ratio: 1.62
   Attention entropy: 46.890

RoBERTa:
   CLS attention max: 0.269
   Self-attention mean: 0.132
   Content/Function ratio: 41.67
   Attention entropy: 48.691

==================== SENTENCE 53 ====================
ANALYSE PATTERNS: 'Which Medications Are Your Best Bets? Consumer Reports' effort to rate drugs offers a lesson to pharmaceutical companies.'
============================================================

BERT:
   CLS attention max: 0.111
   Self-attention mean: 0.101
   Content/Function ratio: 2.96
   Attention entropy: 43.126

DistilBERT:
   CLS attention max: 0.188
   Self-attention mean: 0.091
   Content/Function ratio: 2.39
   Attention entropy: 46.230

RoBERTa:
   CLS attention max: 0.275
   Self-attention mean: 0.139
   Content/Function ratio: 40.00
   Attention entropy: 50.162

==================== SENTENCE 54 ====================
ANALYSE PATTERNS: 'M'm! M'm! Could Be Better! Campbell Soup turns in a good quarter, but there are better alternatives.'
============================================================

BERT:
   CLS attention max: 0.121
   Self-attention mean: 0.103
   Content/Function ratio: 3.56
   Attention entropy: 57.247

DistilBERT:
   CLS attention max: 0.167
   Self-attention mean: 0.076
   Content/Function ratio: 2.43
   Attention entropy: 64.630

RoBERTa:
   CLS attention max: 0.270
   Self-attention mean: 0.139
   Content/Function ratio: 38.46
   Attention entropy: 50.044

==================== SENTENCE 55 ====================
ANALYSE PATTERNS: 'Is Santa Skipping Wal-Mart? Plus, few defectors in the wireless war, and Overstock's locked and loaded.'
============================================================

BERT:
   CLS attention max: 0.100
   Self-attention mean: 0.098
   Content/Function ratio: 1.74
   Attention entropy: 59.411

DistilBERT:
   CLS attention max: 0.176
   Self-attention mean: 0.089
   Content/Function ratio: 1.54
   Attention entropy: 65.050

RoBERTa:
   CLS attention max: 0.243
   Self-attention mean: 0.124
   Content/Function ratio: 1.69
   Attention entropy: 59.441

==================== SENTENCE 56 ====================
ANALYSE PATTERNS: 'Whole Foods' Healthy Outlook The natural foods chain is predicting double-digit sales growth until 2010.'
============================================================

BERT:
   CLS attention max: 0.105
   Self-attention mean: 0.118
   Content/Function ratio: 1.86
   Attention entropy: 40.640

DistilBERT:
   CLS attention max: 0.222
   Self-attention mean: 0.093
   Content/Function ratio: 1.57
   Attention entropy: 39.633

RoBERTa:
   CLS attention max: 0.285
   Self-attention mean: 0.152
   Content/Function ratio: 45.45
   Attention entropy: 41.706

==================== SENTENCE 57 ====================
ANALYSE PATTERNS: 'A License to Print Money Will Coinstar's diversification efforts damage its uniquely profitable business model?'
============================================================

BERT:
   CLS attention max: 0.151
   Self-attention mean: 0.118
   Content/Function ratio: 0.90
   Attention entropy: 36.281

DistilBERT:
   CLS attention max: 0.239
   Self-attention mean: 0.091
   Content/Function ratio: 0.57
   Attention entropy: 35.517

RoBERTa:
   CLS attention max: 0.336
   Self-attention mean: 0.155
   Content/Function ratio: 1.51
   Attention entropy: 40.065

==================== SENTENCE 58 ====================
ANALYSE PATTERNS: 'Palestinian economy in decline The Palestinian economy is in crisis, performing well below its potential, the World Bank says.'
============================================================

BERT:
   CLS attention max: 0.136
   Self-attention mean: 0.099
   Content/Function ratio: 2.20
   Attention entropy: 47.137

DistilBERT:
   CLS attention max: 0.200
   Self-attention mean: 0.078
   Content/Function ratio: 2.62
   Attention entropy: 50.624

RoBERTa:
   CLS attention max: 0.276
   Self-attention mean: 0.120
   Content/Function ratio: 41.67
   Attention entropy: 46.551

==================== SENTENCE 59 ====================
ANALYSE PATTERNS: 'Synnex's World Isn't Flat The company is moving along a dual track -- growth through acquisitions and organic efforts.'
============================================================

BERT:
   CLS attention max: 0.102
   Self-attention mean: 0.103
   Content/Function ratio: 2.46
   Attention entropy: 64.966

DistilBERT:
   CLS attention max: 0.163
   Self-attention mean: 0.079
   Content/Function ratio: 1.82
   Attention entropy: 67.154

RoBERTa:
   CLS attention max: 0.247
   Self-attention mean: 0.122
   Content/Function ratio: 40.00
   Attention entropy: 51.148

==================== SENTENCE 60 ====================
ANALYSE PATTERNS: 'Rising material costs hit Heinz Second quarter profits at ketchup maker Heinz are hit by higher material and transport costs.'
============================================================

BERT:
   CLS attention max: 0.132
   Self-attention mean: 0.120
   Content/Function ratio: 1.54
   Attention entropy: 49.509

DistilBERT:
   CLS attention max: 0.252
   Self-attention mean: 0.087
   Content/Function ratio: 1.71
   Attention entropy: 48.379

RoBERTa:
   CLS attention max: 0.282
   Self-attention mean: 0.126
   Content/Function ratio: 37.04
   Attention entropy: 51.436

==================== SENTENCE 61 ====================
ANALYSE PATTERNS: 'Profit From Management Integrity Laser manufacturer Candela's management avoids taking the easy way out to explain shortfalls.'
============================================================

BERT:
   CLS attention max: 0.107
   Self-attention mean: 0.111
   Content/Function ratio: 2.31
   Attention entropy: 50.228

DistilBERT:
   CLS attention max: 0.184
   Self-attention mean: 0.086
   Content/Function ratio: 2.02
   Attention entropy: 54.418

RoBERTa:
   CLS attention max: 0.260
   Self-attention mean: 0.157
   Content/Function ratio: 41.67
   Attention entropy: 48.239

==================== SENTENCE 62 ====================
ANALYSE PATTERNS: 'United Airlines imposes wage cuts America's second largest airline announces widespread pay cuts as it strives to emerge from bankruptcy.'
============================================================

BERT:
   CLS attention max: 0.145
   Self-attention mean: 0.095
   Content/Function ratio: 24.88
   Attention entropy: 50.851

DistilBERT:
   CLS attention max: 0.220
   Self-attention mean: 0.078
   Content/Function ratio: 30.40
   Attention entropy: 53.361

RoBERTa:
   CLS attention max: 0.288
   Self-attention mean: 0.135
   Content/Function ratio: 41.67
   Attention entropy: 46.866

==================== SENTENCE 63 ====================
ANALYSE PATTERNS: 'Is Disney a Growth Stock? Plus, Mel's Sirius decision, Phil hangs up his Nikes, and Mattel's "free plus" dividend.'
============================================================

BERT:
   CLS attention max: 0.072
   Self-attention mean: 0.085
   Content/Function ratio: 1.81
   Attention entropy: 84.554

DistilBERT:
   CLS attention max: 0.155
   Self-attention mean: 0.076
   Content/Function ratio: 2.46
   Attention entropy: 86.452

RoBERTa:
   CLS attention max: 0.231
   Self-attention mean: 0.113
   Content/Function ratio: 2.32
   Attention entropy: 68.335

==================== SENTENCE 64 ====================
ANALYSE PATTERNS: 'PDL Rakes It In A robust revenue stream combined with an exciting drug pipeline is the recipe for success.'
============================================================

BERT:
   CLS attention max: 0.104
   Self-attention mean: 0.107
   Content/Function ratio: 2.53
   Attention entropy: 50.046

DistilBERT:
   CLS attention max: 0.238
   Self-attention mean: 0.086
   Content/Function ratio: 2.42
   Attention entropy: 50.803

RoBERTa:
   CLS attention max: 0.287
   Self-attention mean: 0.152
   Content/Function ratio: 41.67
   Attention entropy: 48.186

==================== SENTENCE 65 ====================
ANALYSE PATTERNS: 'Emisphere Wins Novartis Over Emisphere warrants close attention, although massive profits are not in the near-term cards.'
============================================================

BERT:
   CLS attention max: 0.104
   Self-attention mean: 0.102
   Content/Function ratio: 3.47
   Attention entropy: 61.267

DistilBERT:
   CLS attention max: 0.237
   Self-attention mean: 0.079
   Content/Function ratio: 2.63
   Attention entropy: 61.966

RoBERTa:
   CLS attention max: 0.264
   Self-attention mean: 0.130
   Content/Function ratio: 2.60
   Attention entropy: 53.318

==================== SENTENCE 66 ====================
ANALYSE PATTERNS: 'Pricey Gas Stalls AutoZone The retailer posts a flat first quarter, claiming high gas prices affect consumers' car budgets.'
============================================================

BERT:
   CLS attention max: 0.086
   Self-attention mean: 0.119
   Content/Function ratio: 2.04
   Attention entropy: 55.576

DistilBERT:
   CLS attention max: 0.172
   Self-attention mean: 0.086
   Content/Function ratio: 1.70
   Attention entropy: 56.851

RoBERTa:
   CLS attention max: 0.265
   Self-attention mean: 0.135
   Content/Function ratio: 37.04
   Attention entropy: 55.092

==================== SENTENCE 67 ====================
ANALYSE PATTERNS: 'One Really Ugly Mark on Star Gas Inscribe this stock with "losing customers, bad debt terms, and, maybe, bankruptcy."'
============================================================

BERT:
   CLS attention max: 0.108
   Self-attention mean: 0.105
   Content/Function ratio: 2.69
   Attention entropy: 70.515

DistilBERT:
   CLS attention max: 0.179
   Self-attention mean: 0.082
   Content/Function ratio: 1.38
   Attention entropy: 74.483

RoBERTa:
   CLS attention max: 0.263
   Self-attention mean: 0.122
   Content/Function ratio: 33.33
   Attention entropy: 63.683

==================== SENTENCE 68 ====================
ANALYSE PATTERNS: 'High oil prices hit China growth Rising oil prices are expected to hit China's growth rate this year.'
============================================================

BERT:
   CLS attention max: 0.092
   Self-attention mean: 0.111
   Content/Function ratio: 1.80
   Attention entropy: 46.162

DistilBERT:
   CLS attention max: 0.245
   Self-attention mean: 0.084
   Content/Function ratio: 1.26
   Attention entropy: 45.522

RoBERTa:
   CLS attention max: 0.271
   Self-attention mean: 0.123
   Content/Function ratio: 45.45
   Attention entropy: 43.725

==================== SENTENCE 69 ====================
ANALYSE PATTERNS: 'Bowes Takes a Bow Pitney Bowes always seems to mail it in -- and that's not necessarily a bad thing.'
============================================================

BERT:
   CLS attention max: 0.130
   Self-attention mean: 0.090
   Content/Function ratio: 3.08
   Attention entropy: 51.124

DistilBERT:
   CLS attention max: 0.206
   Self-attention mean: 0.080
   Content/Function ratio: 2.54
   Attention entropy: 63.539

RoBERTa:
   CLS attention max: 0.266
   Self-attention mean: 0.129
   Content/Function ratio: 37.04
   Attention entropy: 52.537

==================== SENTENCE 70 ====================
ANALYSE PATTERNS: 'ADV: \$150,000 Mortgage for Under \$690/Month Mortgage rates are at record lows. Save \$1000s on your mortgage payment. Free quotes.'
============================================================

BERT:
   CLS attention max: 0.090
   Self-attention mean: 0.103
   Content/Function ratio: 1.56
   Attention entropy: 95.987

DistilBERT:
   CLS attention max: 0.144
   Self-attention mean: 0.072
   Content/Function ratio: 1.42
   Attention entropy: 88.378

RoBERTa:
   CLS attention max: 0.213
   Self-attention mean: 0.127
   Content/Function ratio: 26.32
   Attention entropy: 84.686

==================== SENTENCE 71 ====================
ANALYSE PATTERNS: 'Wal-Mart Sees Lackluster November Sales (Reuters) Reuters - Wal-Mart Stores Inc. , the\world's largest retailer, slashed its own expectations of'
============================================================

BERT:
   CLS attention max: 0.083
   Self-attention mean: 0.091
   Content/Function ratio: 3.09
   Attention entropy: 81.875

DistilBERT:
   CLS attention max: 0.142
   Self-attention mean: 0.070
   Content/Function ratio: 1.56
   Attention entropy: 90.060

RoBERTa:
   CLS attention max: 0.275
   Self-attention mean: 0.126
   Content/Function ratio: 28.57
   Attention entropy: 76.071

==================== SENTENCE 72 ====================
ANALYSE PATTERNS: 'Don't Listen to Buffett Not all the time, anyway. Moneyball author Michael Lewis says conventional wisdom creates inefficiencies.'
============================================================

BERT:
   CLS attention max: 0.110
   Self-attention mean: 0.102
   Content/Function ratio: 5.55
   Attention entropy: 60.896

DistilBERT:
   CLS attention max: 0.141
   Self-attention mean: 0.082
   Content/Function ratio: 3.11
   Attention entropy: 78.527

RoBERTa:
   CLS attention max: 0.194
   Self-attention mean: 0.121
   Content/Function ratio: 37.04
   Attention entropy: 60.009

==================== SENTENCE 73 ====================
ANALYSE PATTERNS: 'In Pursuit of Happiness One Fool experiences more than two hours of lost pre-party productivity in a Barnes Noble quest.'
============================================================

BERT:
   CLS attention max: 0.118
   Self-attention mean: 0.098
   Content/Function ratio: 2.11
   Attention entropy: 48.448

DistilBERT:
   CLS attention max: 0.199
   Self-attention mean: 0.080
   Content/Function ratio: 2.01
   Attention entropy: 53.439

RoBERTa:
   CLS attention max: 0.255
   Self-attention mean: 0.135
   Content/Function ratio: 1.88
   Attention entropy: 54.234

==================== SENTENCE 74 ====================
ANALYSE PATTERNS: 'Middle Class America The Post's Jonathan Weisman discusses the increasing importance of temporary employment to the American economy.'
============================================================

BERT:
   CLS attention max: 0.099
   Self-attention mean: 0.111
   Content/Function ratio: 2.97
   Attention entropy: 42.892

DistilBERT:
   CLS attention max: 0.213
   Self-attention mean: 0.088
   Content/Function ratio: 1.98
   Attention entropy: 46.119

RoBERTa:
   CLS attention max: 0.285
   Self-attention mean: 0.134
   Content/Function ratio: 43.48
   Attention entropy: 45.155

==================== SENTENCE 75 ====================
ANALYSE PATTERNS: 'Stern and Letterman Get Sirius Stern has a date with Letterman tonight, and you can expect fireworks.'
============================================================

BERT:
   CLS attention max: 0.109
   Self-attention mean: 0.116
   Content/Function ratio: 3.30
   Attention entropy: 43.898

DistilBERT:
   CLS attention max: 0.183
   Self-attention mean: 0.094
   Content/Function ratio: 3.89
   Attention entropy: 45.688

RoBERTa:
   CLS attention max: 0.273
   Self-attention mean: 0.144
   Content/Function ratio: 41.67
   Attention entropy: 49.072

==================== SENTENCE 76 ====================
ANALYSE PATTERNS: 'Nortel to lay off 3,500 The scandal-beset company will also lay off about 10 percent of its work force.'
============================================================

BERT:
   CLS attention max: 0.091
   Self-attention mean: 0.111
   Content/Function ratio: 2.71
   Attention entropy: 52.716

DistilBERT:
   CLS attention max: 0.195
   Self-attention mean: 0.085
   Content/Function ratio: 1.89
   Attention entropy: 59.245

RoBERTa:
   CLS attention max: 0.282
   Self-attention mean: 0.129
   Content/Function ratio: 34.48
   Attention entropy: 56.232

==================== SENTENCE 77 ====================
ANALYSE PATTERNS: 'Microsoft Eyes Lighter Versions of Longhorn Operating systems would be designed for specific server tasks, company says.'
============================================================

BERT:
   CLS attention max: 0.107
   Self-attention mean: 0.114
   Content/Function ratio: 2.03
   Attention entropy: 46.024

DistilBERT:
   CLS attention max: 0.174
   Self-attention mean: 0.092
   Content/Function ratio: 2.36
   Attention entropy: 45.003

RoBERTa:
   CLS attention max: 0.275
   Self-attention mean: 0.117
   Content/Function ratio: 41.67
   Attention entropy: 50.254

==================== SENTENCE 78 ====================
ANALYSE PATTERNS: 'Tokyo Edge: New Choices in Digital Entertainment PC and home theater make sleek package, while new portable music devices abound.'
============================================================

BERT:
   CLS attention max: 0.099
   Self-attention mean: 0.098
   Content/Function ratio: 1.96
   Attention entropy: 57.856

DistilBERT:
   CLS attention max: 0.121
   Self-attention mean: 0.083
   Content/Function ratio: 1.84
   Attention entropy: 60.955

RoBERTa:
   CLS attention max: 0.255
   Self-attention mean: 0.104
   Content/Function ratio: 37.04
   Attention entropy: 58.760

==================== SENTENCE 79 ====================
ANALYSE PATTERNS: 'First Look: Skip Gateway's MP3 Photo Jukebox Color display and photo support can't save oddly designed player.'
============================================================

BERT:
   CLS attention max: 0.098
   Self-attention mean: 0.098
   Content/Function ratio: 28.83
   Attention entropy: 54.808

DistilBERT:
   CLS attention max: 0.135
   Self-attention mean: 0.087
   Content/Function ratio: 35.57
   Attention entropy: 64.080

RoBERTa:
   CLS attention max: 0.245
   Self-attention mean: 0.112
   Content/Function ratio: 38.46
   Attention entropy: 56.762

==================== SENTENCE 80 ====================
ANALYSE PATTERNS: 'Reg readers name BSA antipiracy weasel Poll result The people have spoken'
============================================================

BERT:
   CLS attention max: 0.114
   Self-attention mean: 0.150
   Content/Function ratio: 0.96
   Attention entropy: 28.237

DistilBERT:
   CLS attention max: 0.281
   Self-attention mean: 0.109
   Content/Function ratio: 0.79
   Attention entropy: 28.323

RoBERTa:
   CLS attention max: 0.359
   Self-attention mean: 0.147
   Content/Function ratio: 55.56
   Attention entropy: 32.112

==================== SENTENCE 81 ====================
ANALYSE PATTERNS: 'GeekTech: Here Comes BTX New industry standard should offer cooler, quieter systems--so why isn't anybody rushing to embrace it?'
============================================================

BERT:
   CLS attention max: 0.117
   Self-attention mean: 0.090
   Content/Function ratio: 22.79
   Attention entropy: 62.549

DistilBERT:
   CLS attention max: 0.146
   Self-attention mean: 0.086
   Content/Function ratio: 23.51
   Attention entropy: 68.407

RoBERTa:
   CLS attention max: 0.308
   Self-attention mean: 0.141
   Content/Function ratio: 33.33
   Attention entropy: 59.569

==================== SENTENCE 82 ====================
ANALYSE PATTERNS: 'Microsoft Readies Windows Server 2003 Update R2, an interim release, will begin beta testing later this month.'
============================================================

BERT:
   CLS attention max: 0.093
   Self-attention mean: 0.109
   Content/Function ratio: 3.68
   Attention entropy: 52.107

DistilBERT:
   CLS attention max: 0.191
   Self-attention mean: 0.086
   Content/Function ratio: 4.77
   Attention entropy: 49.449

RoBERTa:
   CLS attention max: 0.277
   Self-attention mean: 0.130
   Content/Function ratio: 41.67
   Attention entropy: 49.618

==================== SENTENCE 83 ====================
ANALYSE PATTERNS: 'Microsoft sees bespoke Windows everywhere Analysis Premium hand-tuning service available'
============================================================

BERT:
   CLS attention max: 0.174
   Self-attention mean: 0.139
   Content/Function ratio: 30.11
   Attention entropy: 28.086

DistilBERT:
   CLS attention max: 0.282
   Self-attention mean: 0.115
   Content/Function ratio: 28.85
   Attention entropy: 27.557

RoBERTa:
   CLS attention max: 0.383
   Self-attention mean: 0.143
   Content/Function ratio: 62.50
   Attention entropy: 30.654

==================== SENTENCE 84 ====================
ANALYSE PATTERNS: 'Virus targets 64-bit Windows Digital pest prototype infects files only found in early Windows code for AMD 64-bit Opteron processors.'
============================================================

BERT:
   CLS attention max: 0.123
   Self-attention mean: 0.095
   Content/Function ratio: 1.88
   Attention entropy: 64.211

DistilBERT:
   CLS attention max: 0.186
   Self-attention mean: 0.080
   Content/Function ratio: 2.11
   Attention entropy: 67.948

RoBERTa:
   CLS attention max: 0.234
   Self-attention mean: 0.111
   Content/Function ratio: 2.53
   Attention entropy: 67.480

==================== SENTENCE 85 ====================
ANALYSE PATTERNS: 'Sony camera blends photos, video The company's latest digital camera includes advanced video features.'
============================================================

BERT:
   CLS attention max: 0.130
   Self-attention mean: 0.113
   Content/Function ratio: 1.80
   Attention entropy: 40.820

DistilBERT:
   CLS attention max: 0.229
   Self-attention mean: 0.086
   Content/Function ratio: 2.22
   Attention entropy: 42.075

RoBERTa:
   CLS attention max: 0.306
   Self-attention mean: 0.124
   Content/Function ratio: 52.63
   Attention entropy: 36.942

==================== SENTENCE 86 ====================
ANALYSE PATTERNS: 'Triumphant return of the big Reg logo t-shirt Cash'n'Carrion Cue trumpets, etc'
============================================================

BERT:
   CLS attention max: 0.150
   Self-attention mean: 0.118
   Content/Function ratio: 1.01
   Attention entropy: 39.601

DistilBERT:
   CLS attention max: 0.231
   Self-attention mean: 0.100
   Content/Function ratio: 1.13
   Attention entropy: 43.212

RoBERTa:
   CLS attention max: 0.337
   Self-attention mean: 0.130
   Content/Function ratio: 38.46
   Attention entropy: 51.250

==================== SENTENCE 87 ====================
ANALYSE PATTERNS: 'Worms may slow Parkinson's A protein which helps increase lifespan in worms offers hope for new Parkinson's and Alzheimer's treatments.'
============================================================

BERT:
   CLS attention max: 0.110
   Self-attention mean: 0.101
   Content/Function ratio: 2.27
   Attention entropy: 62.052

DistilBERT:
   CLS attention max: 0.204
   Self-attention mean: 0.078
   Content/Function ratio: 3.67
   Attention entropy: 67.003

RoBERTa:
   CLS attention max: 0.268
   Self-attention mean: 0.142
   Content/Function ratio: 35.71
   Attention entropy: 57.050

==================== SENTENCE 88 ====================
ANALYSE PATTERNS: 'FCC Moves Toward Voice, Data, Broadband on Planes Agency to auction licenses for communications; consider cell phone use during flights.'
============================================================

BERT:
   CLS attention max: 0.127
   Self-attention mean: 0.113
   Content/Function ratio: 1.03
   Attention entropy: 51.383

DistilBERT:
   CLS attention max: 0.201
   Self-attention mean: 0.091
   Content/Function ratio: 2.38
   Attention entropy: 54.816

RoBERTa:
   CLS attention max: 0.247
   Self-attention mean: 0.102
   Content/Function ratio: 34.48
   Attention entropy: 65.139

==================== SENTENCE 89 ====================
ANALYSE PATTERNS: 'Thumb twiddling on cybersecurity Congresswoman Zoe Lofgren says bureaucratic miscues continue to hamstring serious government action to combat cyberattacks.'
============================================================

BERT:
   CLS attention max: 0.120
   Self-attention mean: 0.092
   Content/Function ratio: 1.82
   Attention entropy: 76.549

DistilBERT:
   CLS attention max: 0.209
   Self-attention mean: 0.071
   Content/Function ratio: 1.34
   Attention entropy: 80.111

RoBERTa:
   CLS attention max: 0.251
   Self-attention mean: 0.118
   Content/Function ratio: 3.31
   Attention entropy: 59.397

==================== SENTENCE 90 ====================
ANALYSE PATTERNS: 'Last Xmas order date for the Antipodes Cash'n'Carrion Get 'em in by Sunday'
============================================================

BERT:
   CLS attention max: 0.200
   Self-attention mean: 0.127
   Content/Function ratio: 0.91
   Attention entropy: 43.208

DistilBERT:
   CLS attention max: 0.289
   Self-attention mean: 0.104
   Content/Function ratio: 0.85
   Attention entropy: 44.560

RoBERTa:
   CLS attention max: 0.358
   Self-attention mean: 0.136
   Content/Function ratio: 40.00
   Attention entropy: 47.427

==================== SENTENCE 91 ====================
ANALYSE PATTERNS: 'Alleged Apple Flash iPod 'partner' signs with Rio SigmaTel's chips claimed to have won Apple's support'
============================================================

BERT:
   CLS attention max: 0.152
   Self-attention mean: 0.111
   Content/Function ratio: 17.41
   Attention entropy: 48.011

DistilBERT:
   CLS attention max: 0.249
   Self-attention mean: 0.085
   Content/Function ratio: 20.24
   Attention entropy: 49.751

RoBERTa:
   CLS attention max: 0.327
   Self-attention mean: 0.147
   Content/Function ratio: 38.46
   Attention entropy: 53.579

==================== SENTENCE 92 ====================
ANALYSE PATTERNS: 'Older Windows OSes need critical patch Microsoft releases critical Explorer patch VNUNet.com'
============================================================

BERT:
   CLS attention max: 0.162
   Self-attention mean: 0.128
   Content/Function ratio: 22.65
   Attention entropy: 33.761

DistilBERT:
   CLS attention max: 0.266
   Self-attention mean: 0.110
   Content/Function ratio: 20.40
   Attention entropy: 29.918

RoBERTa:
   CLS attention max: 0.254
   Self-attention mean: 0.134
   Content/Function ratio: 47.62
   Attention entropy: 45.713

==================== SENTENCE 93 ====================
ANALYSE PATTERNS: 'Photo: XM's portable satellite radio XM Satellite Radio Holdings introduced a handheld portable version of its satellite radio.'
============================================================

BERT:
   CLS attention max: 0.117
   Self-attention mean: 0.102
   Content/Function ratio: 2.43
   Attention entropy: 51.215

DistilBERT:
   CLS attention max: 0.172
   Self-attention mean: 0.082
   Content/Function ratio: 2.18
   Attention entropy: 55.097

RoBERTa:
   CLS attention max: 0.288
   Self-attention mean: 0.121
   Content/Function ratio: 40.00
   Attention entropy: 50.175

==================== SENTENCE 94 ====================
ANALYSE PATTERNS: 'US cyber security chief resigns The man charged with making US computer networks safer has resigned suddenly.'
============================================================

BERT:
   CLS attention max: 0.172
   Self-attention mean: 0.128
   Content/Function ratio: 1.65
   Attention entropy: 40.506

DistilBERT:
   CLS attention max: 0.209
   Self-attention mean: 0.101
   Content/Function ratio: 1.68
   Attention entropy: 41.638

RoBERTa:
   CLS attention max: 0.296
   Self-attention mean: 0.132
   Content/Function ratio: 47.62
   Attention entropy: 40.024

==================== SENTENCE 95 ====================
ANALYSE PATTERNS: 'McAfee Enhances Spyware Protection (PC World) PC World - Antivirus company offers improved anti-spyware app to users for a fee.'
============================================================

BERT:
   CLS attention max: 0.118
   Self-attention mean: 0.122
   Content/Function ratio: 2.30
   Attention entropy: 76.505

DistilBERT:
   CLS attention max: 0.213
   Self-attention mean: 0.078
   Content/Function ratio: 2.50
   Attention entropy: 73.708

RoBERTa:
   CLS attention max: 0.251
   Self-attention mean: 0.144
   Content/Function ratio: 29.41
   Attention entropy: 75.294

==================== SENTENCE 96 ====================
ANALYSE PATTERNS: 'Arm Holdings buys US tech firm Microprocessor designer Arm Holdings buys US tech firm Artisan for about \$913m.'
============================================================

BERT:
   CLS attention max: 0.103
   Self-attention mean: 0.096
   Content/Function ratio: 28.26
   Attention entropy: 57.627

DistilBERT:
   CLS attention max: 0.190
   Self-attention mean: 0.087
   Content/Function ratio: 28.14
   Attention entropy: 56.872

RoBERTa:
   CLS attention max: 0.278
   Self-attention mean: 0.124
   Content/Function ratio: 37.04
   Attention entropy: 51.233

==================== SENTENCE 97 ====================
ANALYSE PATTERNS: 'Outsourcing Finds Vietnam Vietnam is making a big push to turn itself into an outsourcing powerhouse.'
============================================================

BERT:
   CLS attention max: 0.115
   Self-attention mean: 0.106
   Content/Function ratio: 2.41
   Attention entropy: 46.226

DistilBERT:
   CLS attention max: 0.245
   Self-attention mean: 0.086
   Content/Function ratio: 2.36
   Attention entropy: 44.096

RoBERTa:
   CLS attention max: 0.290
   Self-attention mean: 0.138
   Content/Function ratio: 45.45
   Attention entropy: 44.414

==================== SENTENCE 98 ====================
ANALYSE PATTERNS: 'Watchdog attacks ID card scheme Proposals for identity cards and a population register are opposed by Britain's information watchdog.'
============================================================

BERT:
   CLS attention max: 0.096
   Self-attention mean: 0.109
   Content/Function ratio: 1.74
   Attention entropy: 53.928

DistilBERT:
   CLS attention max: 0.239
   Self-attention mean: 0.078
   Content/Function ratio: 1.73
   Attention entropy: 50.667

RoBERTa:
   CLS attention max: 0.279
   Self-attention mean: 0.135
   Content/Function ratio: 38.46
   Attention entropy: 53.603

==================== SENTENCE 99 ====================
ANALYSE PATTERNS: 'Salesforce.com reports subscriber surge The subscription software company adds 85,000 individual subscribers to its online customer information system.'
============================================================

BERT:
   CLS attention max: 0.098
   Self-attention mean: 0.106
   Content/Function ratio: 2.21
   Attention entropy: 61.773

DistilBERT:
   CLS attention max: 0.219
   Self-attention mean: 0.072
   Content/Function ratio: 1.80
   Attention entropy: 58.843

RoBERTa:
   CLS attention max: 0.287
   Self-attention mean: 0.140
   Content/Function ratio: 38.46
   Attention entropy: 54.408

==================== SENTENCE 100 ====================
ANALYSE PATTERNS: 'Will historic flight launch space tourism? Regardless, space competitions are poised to become big business.'
============================================================

BERT:
   CLS attention max: 0.091
   Self-attention mean: 0.118
   Content/Function ratio: 2.11
   Attention entropy: 43.097

DistilBERT:
   CLS attention max: 0.189
   Self-attention mean: 0.093
   Content/Function ratio: 1.71
   Attention entropy: 42.230

RoBERTa:
   CLS attention max: 0.285
   Self-attention mean: 0.132
   Content/Function ratio: 50.00
   Attention entropy: 39.187
In [10]:
def visualize_attention_patterns(all_patterns, sentence_domains):
    """Visualize attention patterns across models and sentences"""

    metrics = ["cls_attention_max", "self_attention_mean", "content_vs_function", "attention_entropy"]
    metric_names = ["CLS Attention Max", "Self-Attention Mean", "Content/Function Ratio", "Entropy"]
    models = list(models_data.keys())
    sentence_keys = list(all_patterns.keys())

    # 1. GRAPHICS BY METRIC (overall averages per model)
    print("\n OVERALL AVERAGE PATTERNS:")
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=metric_names,
        specs=[[{"type": "bar"}, {"type": "bar"}],
               [{"type": "bar"}, {"type": "bar"}]]
    )
    positions = [(1,1), (1,2), (2,1), (2,2)]
    
    for idx, (metric, metric_name) in enumerate(zip(metrics, metric_names)):
        row, col = positions[idx]
        model_means = {}
        for model in models:
            values = [all_patterns[sentence_key][model][metric] for sentence_key in sentence_keys]
            model_means[model] = np.mean(values)
        
        colors = [MODELS[model]["color"] for model in models]
        fig.add_trace(
            go.Bar(
                x=list(model_means.keys()),
                y=list(model_means.values()),
                marker_color=colors,
                name=metric_name,
                showlegend=False
            ),
            row=row, col=col
        )
    
    fig.update_layout(
        title_text="Attention Patterns per Model (Overall Averages)",
        title_x=0.5,
        height=600
    )
    
    fig.show()

    # 2. TABLE BY DOMAIN
    print("\n TABLE BY DOMAIN:")

    domain_data = []
    for sentence_key, domain in zip(sentence_keys, sentence_domains):
        for model in models:
            for metric in metrics:
                domain_data.append({
                    "Domain": domain,
                    "Model": model,
                    "Metric": metric,
                    "Value": all_patterns[sentence_key][model][metric]
                })
    
    domain_df = pd.DataFrame(domain_data)
    
    pivot_table = domain_df.pivot_table(
        values='Value', 
        index=['Domain', 'Model'], 
        columns='Metric', 
        aggfunc='mean'
    ).round(3)
    
    print("\nAverage metrics by domain and model:")
    display(pivot_table)

    # 3. TABLE BY SENTENCES
    print("\n TABLE BY SENTENCES:")
    pattern_data = []
    for i, (sentence_key, domain) in enumerate(zip(sentence_keys, sentence_domains)):
        for model in models:
            row = {
                "Sentence": f"S{i+1}",
                "Domain": domain,
                "Model": model
            }
            for metric in metrics:
                row[metric] = f"{all_patterns[sentence_key][model][metric]:.3f}"
            pattern_data.append(row)

    pattern_df = pd.DataFrame(pattern_data)
    display(pattern_df)

    # 4. COMPARATIVE CHART BY DOMAIN
    print("\n DOMAIN COMPARISON:")
    fig_domain = make_subplots(
        rows=2, cols=2,
        subplot_titles=metric_names,
        specs=[[{"type": "bar"}, {"type": "bar"}],
               [{"type": "bar"}, {"type": "bar"}]]
    )
    
    unique_domains = sorted(set(sentence_domains))
    
    for idx, (metric, metric_name) in enumerate(zip(metrics, metric_names)):
        row, col = positions[idx]
        
        for model in models:
            domain_means = []
            for domain in unique_domains:
                # Filter data for this domain
                domain_values = [
                    all_patterns[sentence_key][model][metric] 
                    for sentence_key, d in zip(sentence_keys, sentence_domains) 
                    if d == domain
                ]
                domain_means.append(np.mean(domain_values))
            
            fig_domain.add_trace(
                go.Bar(
                    x=unique_domains,
                    y=domain_means,
                    name=model,
                    marker_color=MODELS[model]["color"],
                    showlegend=(idx == 0) # Show legend only in first subplot
                ),
                row=row, col=col
            )
    
    fig_domain.update_layout(
        title_text="Attention Patterns by Domain",
        title_x=0.5,
        height=700,
        barmode='group'
    )
    
    fig_domain.show()
    
    return fig, pattern_df, pivot_table

patterns_fig, patterns_df, domain_pivot = visualize_attention_patterns(all_patterns, SENTENCE_DOMAINS)
 OVERALL AVERAGE PATTERNS:
 TABLE BY DOMAIN:

Average metrics by domain and model:
Metric attention_entropy cls_attention_max content_vs_function self_attention_mean
Domain Model
Business BERT 55.365 0.111 3.344 0.105
DistilBERT 58.253 0.197 3.143 0.083
RoBERTa 53.341 0.267 31.708 0.133
Sci/Tech BERT 50.914 0.123 7.479 0.112
DistilBERT 52.223 0.211 7.841 0.089
RoBERTa 51.451 0.289 38.574 0.129
Sports BERT 61.612 0.106 2.803 0.101
DistilBERT 65.759 0.173 2.013 0.083
RoBERTa 59.828 0.269 27.725 0.130
World BERT 55.811 0.127 5.519 0.104
DistilBERT 57.042 0.182 5.542 0.082
RoBERTa 54.705 0.274 33.588 0.128
 TABLE BY SENTENCES:
Sentence Domain Model cls_attention_max self_attention_mean content_vs_function attention_entropy
0 S1 World BERT 0.108 0.088 1.349 61.730
1 S1 World DistilBERT 0.171 0.070 0.888 62.514
2 S1 World RoBERTa 0.281 0.128 3.808 60.199
3 S2 World BERT 0.102 0.112 1.587 53.609
4 S2 World DistilBERT 0.196 0.083 2.178 50.663
... ... ... ... ... ... ... ...
295 S99 Sci/Tech DistilBERT 0.219 0.072 1.799 58.843
296 S99 Sci/Tech RoBERTa 0.287 0.140 38.462 54.408
297 S100 Sci/Tech BERT 0.091 0.118 2.107 43.097
298 S100 Sci/Tech DistilBERT 0.189 0.093 1.711 42.230
299 S100 Sci/Tech RoBERTa 0.285 0.132 50.000 39.187

300 rows × 7 columns

 DOMAIN COMPARISON:

Results Analysis¶

CLS Attention:

  • RoBERTa
  • DistilBERT
  • BERT

Self-Attention:

  • BERT
  • RoBERTa
  • DistilBERT

Content/Function Ratio:

  • RoBERTa
  • BERT
  • DistilBERT

Attention Entropy:

  • RoBERTa
  • BERT
  • DistilBERT

Key Observations¶

In [11]:
import time
import psutil
import gc

def benchmark_model_performance(models_data, test_sentences):
    """Benchmark speed and memory usage of models."""
    
    print("BENCHMARK PERFORMANCE MODELS")
    print("=" * 50)
    
    results = []
    
    for model_name, data in models_data.items():
        print(f"\nTest {model_name}...")
        
        tokenizer = data["tokenizer"]
        model = data["model"]
        times = []
        memory_usage = []
        
        for sentence in test_sentences:   
            gc.collect()
            memory_before = psutil.Process().memory_info().rss / 1024 / 1024  # MB
            start_time = time.time()
            
            with torch.no_grad():
                inputs = tokenizer(sentence, return_tensors="pt", truncation=True)
                outputs = model(**inputs)
            
            end_time = time.time()
            inference_time = end_time - start_time
            times.append(inference_time)
            
            memory_after = psutil.Process().memory_info().rss / 1024 / 1024  # MB
            memory_usage.append(memory_after - memory_before)
        
        avg_time = np.mean(times)
        std_time = np.std(times)
        avg_memory = np.mean(memory_usage)
        
        total_params = sum(p.numel() for p in model.parameters())
        
        result = {
            "Model": model_name,
            "Avg Time (ms)": f"{avg_time*1000:.2f}",
            "Écart-type (ms)": f"{std_time*1000:.2f}",
            "Memory (MB)": f"{avg_memory:.1f}",
            "Parameters": f"{total_params:,}",
            "Speed Relative": 1.0  # Will be calculated later
        }
        
        results.append(result)

        print(f"   Avg Time: {avg_time*1000:.2f} ms")
        print(f"   Memory: {avg_memory:.1f} MB")
        print(f"   Parameters: {total_params:,}")

    # Speed relative (BERT = baseline)
    bert_time = float(results[0]["Avg Time (ms)"].replace(' ms', ''))
    for result in results:
        model_time = float(result["Avg Time (ms)"].replace(' ms', ''))
        result["Speed Relative"] = f"{bert_time/model_time:.2f}x"

    return pd.DataFrame(results)

performance_df = benchmark_model_performance(models_data, TEST_SENTENCES)
print("\n BENCHMARK RESULTS:")
display(performance_df)
BENCHMARK PERFORMANCE MODELS
==================================================

Test BERT...
   Avg Time: 44.25 ms
   Memory: 0.5 MB
   Parameters: 109,482,240

Test DistilBERT...
   Avg Time: 22.20 ms
   Memory: 0.0 MB
   Parameters: 66,362,880

Test RoBERTa...
   Avg Time: 44.05 ms
   Memory: 0.4 MB
   Parameters: 124,645,632

 BENCHMARK RESULTS:
Model Avg Time (ms) Écart-type (ms) Memory (MB) Parameters Speed Relative
0 BERT 44.25 3.65 0.5 109,482,240 1.00x
1 DistilBERT 22.20 0.98 0.0 66,362,880 1.99x
2 RoBERTa 44.05 2.67 0.4 124,645,632 1.00x
In [12]:
def create_performance_charts(performance_df):
    """Create performance comparison charts"""
    
    models = performance_df["Model"].values
    times = [float(x.replace(' ms', '')) for x in performance_df["Avg Time (ms)"].values]
    memory = [float(x.replace(' MB', '')) for x in performance_df["Memory (MB)"].values]
    params = [int(x.replace(',', '')) for x in performance_df["Parameters"].values]
    
    colors = [MODELS[model]["color"] for model in models]
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=("Inference Time", "Memory Usage", 
                       "Parameter Count", "Speed vs Parameters"),
        specs=[[{"type": "bar"}, {"type": "bar"}],
               [{"type": "bar"}, {"type": "scatter"}]]
    )
    
    # Chart 1: Time
    fig.add_trace(
        go.Bar(x=models, y=times, marker_color=colors, name="Time", showlegend=False),
        row=1, col=1
    )
    # Chart 2: Memory
    fig.add_trace(
        go.Bar(x=models, y=memory, marker_color=colors, name="Memory", showlegend=False),
        row=1, col=2
    )
    # Chart 3: Parameters
    fig.add_trace(
        go.Bar(x=models, y=params, marker_color=colors, name="Parameters", showlegend=False),
        row=2, col=1
    )
    # Chart 4: Trade-off
    fig.add_trace(
        go.Scatter(
            x=times, y=params, 
            mode='markers+text',
            marker=dict(size=15, color=colors),
            text=models,
            textposition="top center",
            name="Trade-off",
            showlegend=False
        ),
        row=2, col=2
    )
    
    fig.update_layout(
        title_text="Transformer Models Performance",
        title_x=0.5,
        height=800,
        showlegend=False
    )
    fig.update_xaxes(title_text="Models", row=1, col=1)
    fig.update_xaxes(title_text="Models", row=1, col=2)
    fig.update_xaxes(title_text="Models", row=2, col=1)
    fig.update_xaxes(title_text="Time (ms)", row=2, col=2)
    
    fig.update_yaxes(title_text="Time (ms)", row=1, col=1)
    fig.update_yaxes(title_text="Memory (MB)", row=1, col=2)
    fig.update_yaxes(title_text="Parameters", row=2, col=1)
    fig.update_yaxes(title_text="Parameters", row=2, col=2)
    
    fig.show()
    return fig

performance_fig = create_performance_charts(performance_df)

Summary¶

We compared attention mechanisms across three Transformer architectures and found distinct patterns for each.

1. RoBERTa

2. DistilBERT

3. BERT

When to Use Each Model¶

  • High performance tasks → RoBERTa
  • Resource constraints → DistilBERT
  • General purpose/exploration → BERT

Model Signatures¶

Model CLS Agg. Self-Att. Ratio C/F Entropy Speed
BERT 1.00x
DistilBERT
RoBERTa
In [14]:
import os
os.makedirs("results", exist_ok=True)

# 1. Sentences from the dataset
sentences_df = pd.DataFrame({
    "Sentence_ID": [f"S{i+1}" for i in range(len(TEST_SENTENCES))],
    "Domain": SENTENCE_DOMAINS,
    "Sentence": TEST_SENTENCES,
    "Word_Count": [len(s.split()) for s in TEST_SENTENCES]
})
sentences_df.to_csv("results/dataset_sentences.csv", index=False)

# 2. Metrics by domain
domain_pivot.to_csv("results/metrics_by_domain.csv")

# 3. Metrics by sentence
sentence_metrics = []
for i, (sentence_key, domain) in enumerate(zip(all_patterns.keys(), SENTENCE_DOMAINS)):
    for model in models_data.keys():
        row = {
            "Sentence_ID": f"S{i+1}",
            "Domain": domain,
            "Model": model,
            "CLS_Attention_Max": all_patterns[sentence_key][model]["cls_attention_max"],
            "CLS_Attention_Mean": all_patterns[sentence_key][model]["cls_attention_mean"],
            "Self_Attention_Mean": all_patterns[sentence_key][model]["self_attention_mean"],
            "Content_Function_Ratio": all_patterns[sentence_key][model]["content_vs_function"],
            "Attention_Entropy": all_patterns[sentence_key][model]["attention_entropy"]
        }
        sentence_metrics.append(row)
sentence_metrics_df = pd.DataFrame(sentence_metrics)
sentence_metrics_df.to_csv("results/metrics_by_sentence.csv", index=False)